{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "naughty-killing",
   "metadata": {},
   "source": [
    "# STSI demo using Landsat-8 and Sentinel-2 metadata\n",
    "# STSI used temporal slicing index\n",
    "## This demo could be used to find Spatio-temporal intersection between the two datasets and the potential area of interest at hour level"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "muslim-nitrogen",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "b7aea497-e67e-4192-95e9-b7d85188ccd0",
   "metadata": {},
   "source": [
    "## Input date range for query, we have dataset collected in 2020 in this demo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "present-alcohol",
   "metadata": {},
   "outputs": [],
   "source": [
    "# date in the format of \"yyy-MM-dd\", results include both start date and end date\n",
    "\n",
    "# The hour delay variable represents how many hours apart the two datasets are considered to be temporally intersected\n",
    "# E.g. if hour_delay = 6, for 2020-06-26 12:00:00, timestamps between 2020-06-26 06:00:00 and 2020-06-26 18:00:00 are considered to be temporally intersected\n",
    "start_date = '2018-01-01'\n",
    "end_date = '2018-03-31'\n",
    "duration_month = 3 # for export\n",
    "time_index_hour_length = 24\n",
    "hour_delay = 6"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "liquid-marketing",
   "metadata": {},
   "source": [
    "## please input the potential area of interest, we have \"Beaufort_Sea\" and \"Wandel_Sea\" in in this demo\n",
    "## if not using AOIs, could leave blank"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "gothic-coordinator",
   "metadata": {},
   "outputs": [],
   "source": [
    "Paoi = 'Wandel_Sea'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "encouraging-denmark",
   "metadata": {},
   "source": [
    "## First, setup the Spark and Sedona environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "billion-testament",
   "metadata": {},
   "outputs": [],
   "source": [
    "import findspark\n",
    "#findspark.init() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "owned-field",
   "metadata": {},
   "outputs": [],
   "source": [
    "SPARK_HOME='/opt/cloudera/parcels/CDH/lib/spark'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "present-budget",
   "metadata": {},
   "outputs": [],
   "source": [
    "findspark.init(SPARK_HOME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "deadly-scientist",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/zhangp/.conda/envs/py37_environment/lib/python3.7/site-packages/geopandas/_compat.py:58: UserWarning: The installed version of PyGEOS is too old (0.6 installed, 0.8 required), and thus GeoPandas will not use PyGEOS.\n",
      "  UserWarning,\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import os\n",
    "import codecs\n",
    "import subprocess\n",
    "#from hdfs import InsecureClient\n",
    "import numpy as np\n",
    "#from pyspark import SparkContext\n",
    "from pyspark import SQLContext\n",
    "from pyspark.sql import Row\n",
    "from pyspark.sql import functions as F\n",
    "from pyspark.sql.types import *\n",
    "import rtree\n",
    "from pyspark.sql import Window\n",
    "#import igraph\n",
    "#from igraph import Graph\n",
    "import geofeather"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "regular-identity",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "from pyspark import StorageLevel\n",
    "import geopandas as gpd\n",
    "import pandas as pd\n",
    "from pyspark.sql.types import StructType\n",
    "from pyspark.sql.types import StructField\n",
    "from pyspark.sql.types import StringType\n",
    "from pyspark.sql.types import LongType\n",
    "from shapely.geometry import Point\n",
    "from shapely.geometry import Polygon\n",
    "\n",
    "from sedona.register import SedonaRegistrator\n",
    "from sedona.core.SpatialRDD import SpatialRDD\n",
    "from sedona.core.SpatialRDD import PointRDD\n",
    "from sedona.core.SpatialRDD import PolygonRDD\n",
    "from sedona.core.SpatialRDD import LineStringRDD\n",
    "from sedona.core.enums import FileDataSplitter\n",
    "from sedona.utils.adapter import Adapter\n",
    "from sedona.core.spatialOperator import KNNQuery\n",
    "from sedona.core.spatialOperator import JoinQuery\n",
    "from sedona.core.spatialOperator import JoinQueryRaw\n",
    "from sedona.core.spatialOperator import RangeQuery\n",
    "from sedona.core.spatialOperator import RangeQueryRaw\n",
    "from sedona.core.formatMapper.shapefileParser import ShapefileReader\n",
    "from sedona.core.formatMapper import WkbReader\n",
    "from sedona.core.formatMapper import WktReader\n",
    "from sedona.core.formatMapper import GeoJsonReader\n",
    "from sedona.sql.types import GeometryType\n",
    "from sedona.core.enums import GridType\n",
    "from sedona.core.SpatialRDD import RectangleRDD\n",
    "from sedona.core.enums import IndexType\n",
    "from sedona.core.geom.envelope import Envelope\n",
    "from sedona.utils import SedonaKryoRegistrator, KryoSerializer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "judicial-midwest",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"\n",
    "os.environ['YARN_CONF_DIR'] = \"/opt/cloudera/parcels/CDH/lib/spark/conf/yarn-conf\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "pleased-spirituality",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark = SparkSession \\\n",
    ".builder \\\n",
    ".appName(\"test_test_1\") \\\n",
    ".master('yarn') \\\n",
    ".config(\"spark.serializer\", KryoSerializer.getName) \\\n",
    ".config(\"spark.kryo.registrator\", SedonaKryoRegistrator.getName) \\\n",
    ".config('spark.jars','sedona-core-2.4_2.11-1.0.0-incubating.jar,sedona-sql-2.4_2.11-1.0.0-incubating.jar,sedona-python-adapter-2.4_2.11-1.0.0-incubating.jar,sedona-viz-2.4_2.11-1.0.0-incubating.jar,geotools-wrapper-geotools-24.0.jar') \\\n",
    ".config('spark.executor.memory', '20g') \\\n",
    ".config('spark.driver.memory', '10g') \\\n",
    ".config('spark.sql.shuffle.partitions', 6144) \\\n",
    ".config('spark.executor.instances', '24') \\\n",
    ".config('spark.executor.cores', '5') \\\n",
    ".config('spark.rpc.message.maxSize', '1024') \\\n",
    ".config('spark.yarn.dist.archives', 'environment.tar.gz#environment') \\\n",
    ".getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "greek-peace",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "SedonaRegistrator.registerAll(spark)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "abstract-legislation",
   "metadata": {},
   "source": [
    "## Second, read datasets"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c6cfa58b-b8ed-4a5a-8af9-d5224506f99e",
   "metadata": {},
   "source": [
    "### Spark have different types of input methods, here we read from Hadoop file system, as our Spark 2.4 is built based on Hadoop Yarn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "retired-springfield",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_raw = spark.read.option(\"header\",True).option('inferSchema', True).options(delimiter='|').csv(\"meta_l8_2018_cloud_sep.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "referenced-settle",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_raw = spark.read.option(\"header\",True).option('inferSchema', True).options(delimiter='|').csv(\"meta_s2_2018_cloud_sep.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "intended-edwards",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pr_df_raw = spark.read.option(\"header\",True).option('inferSchema', True).csv(\"meta_pr_2016.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "sunrise-candy",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "155924"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "l8_df_raw.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "relevant-grounds",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "461000"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s2_df_raw.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "olympic-benjamin",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pr_df_raw.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "overhead-musician",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+----------+--------------------+--------------------+\n",
      "|_c0|        system:index|cloudcover|           timestamp|                .geo|\n",
      "+---+--------------------+----------+--------------------+--------------------+\n",
      "|  0|LC08_001004_20180706|     58.52|2018-07-06 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  1|LC08_001004_20180722|     85.11|2018-07-22 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  2|LC08_001004_20180807|      0.08|2018-08-07 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  3|LC08_001004_20180908|      1.85|2018-09-08 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  4|LC08_001005_20180417|      4.23|2018-04-17 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  5|LC08_001005_20180706|     41.22|2018-07-06 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  6|LC08_001005_20180722|     42.02|2018-07-22 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  7|LC08_001005_20180807|       0.1|2018-08-07 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  8|LC08_001005_20180823|     66.75|2018-08-23 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  9|LC08_001005_20180908|     22.47|2018-09-08 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 10|LC08_001006_20180401|      3.01|2018-04-01 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 11|LC08_001006_20180417|      4.27|2018-04-17 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 12|LC08_001006_20180519|     36.96|2018-05-19 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 13|LC08_001006_20180604|      0.07|2018-06-04 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 14|LC08_001006_20180620|      1.37|2018-06-20 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 15|LC08_001006_20180706|      13.4|2018-07-06 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 16|LC08_001006_20180722|     43.27|2018-07-22 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 17|LC08_001006_20180807|      0.23|2018-08-07 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 18|LC08_001006_20180823|     52.62|2018-08-23 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 19|LC08_001006_20180908|     40.37|2018-09-08 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "+---+--------------------+----------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "l8_df_raw.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "integrated-newcastle",
   "metadata": {},
   "outputs": [],
   "source": [
    "# l8_df_raw = l8_df_raw.filter(F.col('cloudcover') <= 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "registered-concept",
   "metadata": {},
   "outputs": [],
   "source": [
    "# l8_df_raw.filter(F.col('cloudcover') == 0).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "expensive-nitrogen",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "155924"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "l8_df_raw.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "public-jamaica",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+----------+--------------------+--------------------+\n",
      "|_c0|        system:index|cloudcover|           timestamp|                .geo|\n",
      "+---+--------------------+----------+--------------------+--------------------+\n",
      "|  0|20171226T094359_2...|     9.219|2018-02-02 11:37:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  1|20180101T082329_2...|    98.622|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  2|20180101T082329_2...|     100.0|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  3|20180101T082329_2...|   47.2689|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  4|20180101T082329_2...|   51.0769|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  5|20180101T082329_2...|   76.9772|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  6|20180101T082329_2...|   77.5421|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  7|20180101T082329_2...|   85.8672|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  8|20180101T082329_2...|   37.5433|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  9|20180101T082329_2...|   67.3636|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 10|20180101T082329_2...|   85.0339|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 11|20180101T082329_2...|   17.2088|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 12|20180101T082329_2...|   16.1119|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 13|20180101T082329_2...|    3.9833|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 14|20180101T082329_2...|   73.4103|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 15|20180101T082329_2...|   68.6584|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 16|20180101T082329_2...|   51.4231|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 17|20180101T082329_2...|   28.1423|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 18|20180101T082329_2...|   23.1729|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 19|20180101T082329_2...|    6.5445|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "+---+--------------------+----------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "s2_df_raw.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "specific-cookie",
   "metadata": {},
   "outputs": [],
   "source": [
    "# s2_df_raw = s2_df_raw.filter(F.col('cloudcover') <= 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "suitable-possible",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "461000"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s2_df_raw.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "numerous-period",
   "metadata": {},
   "source": [
    "### generate geo columns for all columns, using WKT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "induced-proxy",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_raw = s2_df_raw.withColumnRenamed('.geo', 'footprint')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "diverse-motor",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "def generate_WKT(GEE_coo_str):\n",
    "    if 'LinearRing' not in GEE_coo_str:\n",
    "        print('error geo type')\n",
    "        return 'unknown geo'\n",
    "    if re.search('(\\[\\[.+\\]\\])', GEE_coo_str):\n",
    "        gee_coo0 = re.search('(\\[\\[.+\\]\\])', GEE_coo_str).group(1)\n",
    "        test_com = re.compile('(\\[.+?)(\\,)(.+?\\])')\n",
    "        gee_coos_1 = test_com.sub(r'\\1 \\3', gee_coo0)\n",
    "        gee_coos_2 = gee_coos_1.replace('[', '').replace(']', '').replace(',', ', ')\n",
    "        return 'POLYGON ((' + gee_coos_2 + '))'\n",
    "    else:\n",
    "        return 'unknown format'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "terminal-printing",
   "metadata": {},
   "outputs": [],
   "source": [
    "generate_WKT_udf = F.udf(generate_WKT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "electoral-municipality",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_raw = l8_df_raw.withColumnRenamed('.geo', 'footprint')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "linear-budget",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_wkt = l8_df_raw.withColumn('WKT_geo', generate_WKT_udf(F.col('footprint')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "sixth-moscow",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+----------+--------------------+--------------------+--------------------+\n",
      "|_c0|        system:index|cloudcover|           timestamp|           footprint|             WKT_geo|\n",
      "+---+--------------------+----------+--------------------+--------------------+--------------------+\n",
      "|  0|LC08_001004_20180706|     58.52|2018-07-06 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-10.924...|\n",
      "|  1|LC08_001004_20180722|     85.11|2018-07-22 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-16.038...|\n",
      "|  2|LC08_001004_20180807|      0.08|2018-08-07 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-12.596...|\n",
      "|  3|LC08_001004_20180908|      1.85|2018-09-08 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-13.429...|\n",
      "|  4|LC08_001005_20180417|      4.23|2018-04-17 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-19.204...|\n",
      "|  5|LC08_001005_20180706|     41.22|2018-07-06 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-15.823...|\n",
      "|  6|LC08_001005_20180722|     42.02|2018-07-22 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-20.743...|\n",
      "|  7|LC08_001005_20180807|       0.1|2018-08-07 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-25.727...|\n",
      "|  8|LC08_001005_20180823|     66.75|2018-08-23 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-19.249...|\n",
      "|  9|LC08_001005_20180908|     22.47|2018-09-08 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-19.547...|\n",
      "| 10|LC08_001006_20180401|      3.01|2018-04-01 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-19.492...|\n",
      "| 11|LC08_001006_20180417|      4.27|2018-04-17 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-21.456...|\n",
      "| 12|LC08_001006_20180519|     36.96|2018-05-19 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-29.341...|\n",
      "| 13|LC08_001006_20180604|      0.07|2018-06-04 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-29.095...|\n",
      "| 14|LC08_001006_20180620|      1.37|2018-06-20 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-22.339...|\n",
      "| 15|LC08_001006_20180706|      13.4|2018-07-06 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-29.103...|\n",
      "| 16|LC08_001006_20180722|     43.27|2018-07-22 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-29.090...|\n",
      "| 17|LC08_001006_20180807|      0.23|2018-08-07 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-19.801...|\n",
      "| 18|LC08_001006_20180823|     52.62|2018-08-23 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-24.993...|\n",
      "| 19|LC08_001006_20180908|     40.37|2018-09-08 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-22.364...|\n",
      "+---+--------------------+----------+--------------------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "l8_df_wkt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "bacterial-patent",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_wkt = s2_df_raw.withColumn('WKT_geo', generate_WKT_udf(F.col('footprint')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "atomic-bahamas",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+----------+--------------------+--------------------+--------------------+\n",
      "|_c0|        system:index|cloudcover|           timestamp|           footprint|             WKT_geo|\n",
      "+---+--------------------+----------+--------------------+--------------------+--------------------+\n",
      "|  0|20171226T094359_2...|     9.219|2018-02-02 11:37:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-6.5921...|\n",
      "|  1|20180101T082329_2...|    98.622|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((33.1015...|\n",
      "|  2|20180101T082329_2...|     100.0|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.1301...|\n",
      "|  3|20180101T082329_2...|   47.2689|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.1465...|\n",
      "|  4|20180101T082329_2...|   51.0769|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.1574...|\n",
      "|  5|20180101T082329_2...|   76.9772|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((35.1594...|\n",
      "|  6|20180101T082329_2...|   77.5421|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((35.1689...|\n",
      "|  7|20180101T082329_2...|   85.8672|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.0543...|\n",
      "|  8|20180101T082329_2...|   37.5433|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((35.0638...|\n",
      "|  9|20180101T082329_2...|   67.3636|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((36.0089...|\n",
      "| 10|20180101T082329_2...|   85.0339|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((35.1079...|\n",
      "| 11|20180101T082329_2...|   17.2088|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((33.3628...|\n",
      "| 12|20180101T082329_2...|   16.1119|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.1677...|\n",
      "| 13|20180101T082329_2...|    3.9833|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.1008...|\n",
      "| 14|20180101T082329_2...|   73.4103|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.0647...|\n",
      "| 15|20180101T082329_2...|   68.6584|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.0756...|\n",
      "| 16|20180101T082329_2...|   51.4231|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.0746...|\n",
      "| 17|20180101T082329_2...|   28.1423|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((35.3049...|\n",
      "| 18|20180101T082329_2...|   23.1729|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.6001...|\n",
      "| 19|20180101T082329_2...|    6.5445|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((35.3583...|\n",
      "+---+--------------------+----------+--------------------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "s2_df_wkt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "declared-kernel",
   "metadata": {},
   "outputs": [],
   "source": [
    "# s2_df_raw_pd = s2_df_raw.limit(5).toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "exceptional-focus",
   "metadata": {},
   "outputs": [],
   "source": [
    "# s2_df_raw_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "harmful-sunrise",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pr_df_wkt = pr_df_raw.withColumn('WKT_geo', generate_WKT_udf(F.col('footprint')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "interracial-million",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "thrown-swiss",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_wkt = s2_df_wkt.withColumnRenamed('system:index', 's2_index')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "diagnostic-nirvana",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- _c0: integer (nullable = true)\n",
      " |-- s2_index: string (nullable = true)\n",
      " |-- cloudcover: double (nullable = true)\n",
      " |-- timestamp: timestamp (nullable = true)\n",
      " |-- footprint: string (nullable = true)\n",
      " |-- WKT_geo: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "s2_df_wkt.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "refined-healthcare",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- _c0: integer (nullable = true)\n",
      " |-- system:index: string (nullable = true)\n",
      " |-- cloudcover: double (nullable = true)\n",
      " |-- timestamp: timestamp (nullable = true)\n",
      " |-- footprint: string (nullable = true)\n",
      " |-- WKT_geo: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "l8_df_wkt.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "stainless-basics",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_wkt = l8_df_wkt.withColumnRenamed('system:index', 'l8_index')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "automotive-numbers",
   "metadata": {},
   "source": [
    "### transfer to Sedona df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "actual-amino",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_wkt.createOrReplaceTempView(\"l8_df_wkt\")\n",
    "l8_df_sedona = spark.sql(\"select ST_GeomFromWKT(WKT_geo) as l8_geometry, l8_index as l8_img_id, timestamp as l8_timestep from l8_df_wkt\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "meaningful-parallel",
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate the area for the l8\n",
    "\n",
    "l8_df_sedona.createOrReplaceTempView(\"l8_df_sedona\")\n",
    "l8_df_sedona = spark.sql(\"select ST_Area(l8_geometry) as l8_area, * from l8_df_sedona\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "external-analyst",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+--------------------+--------------------+--------------------+\n",
      "|           l8_area|         l8_geometry|           l8_img_id|         l8_timestep|\n",
      "+------------------+--------------------+--------------------+--------------------+\n",
      "|14.124304956531168|POLYGON ((-10.924...|LC08_001004_20180706|2018-07-06 14:07:...|\n",
      "|  14.1368958808218|POLYGON ((-16.038...|LC08_001004_20180722|2018-07-22 14:07:...|\n",
      "| 14.13397280878489|POLYGON ((-12.596...|LC08_001004_20180807|2018-08-07 14:07:...|\n",
      "| 14.21783504836524|POLYGON ((-13.429...|LC08_001004_20180908|2018-09-08 14:07:...|\n",
      "|13.013235111990365|POLYGON ((-19.204...|LC08_001005_20180417|2018-04-17 14:08:...|\n",
      "|13.016815047842258|POLYGON ((-15.823...|LC08_001005_20180706|2018-07-06 14:07:...|\n",
      "| 13.04422889786937|POLYGON ((-20.743...|LC08_001005_20180722|2018-07-22 14:07:...|\n",
      "|13.025145367910945|POLYGON ((-25.727...|LC08_001005_20180807|2018-08-07 14:08:...|\n",
      "|13.088991981998086|POLYGON ((-19.249...|LC08_001005_20180823|2018-08-23 14:08:...|\n",
      "|13.036544425947993|POLYGON ((-19.547...|LC08_001005_20180908|2018-09-08 14:08:...|\n",
      "|11.996375308726677|POLYGON ((-19.492...|LC08_001006_20180401|2018-04-01 14:08:...|\n",
      "| 12.04368416424424|POLYGON ((-21.456...|LC08_001006_20180417|2018-04-17 14:08:...|\n",
      "|11.994548598119355|POLYGON ((-29.341...|LC08_001006_20180519|2018-05-19 14:08:...|\n",
      "|11.907405621268417|POLYGON ((-29.095...|LC08_001006_20180604|2018-06-04 14:07:...|\n",
      "|12.011768123945448|POLYGON ((-22.339...|LC08_001006_20180620|2018-06-20 14:08:...|\n",
      "|11.997177863781467|POLYGON ((-29.103...|LC08_001006_20180706|2018-07-06 14:08:...|\n",
      "|12.036923004713366|POLYGON ((-29.090...|LC08_001006_20180722|2018-07-22 14:08:...|\n",
      "|11.964717576376385|POLYGON ((-19.801...|LC08_001006_20180807|2018-08-07 14:08:...|\n",
      "|11.985447360963212|POLYGON ((-24.993...|LC08_001006_20180823|2018-08-23 14:08:...|\n",
      "|11.987771678599081|POLYGON ((-22.364...|LC08_001006_20180908|2018-09-08 14:08:...|\n",
      "+------------------+--------------------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "l8_df_sedona.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "designing-heather",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "155924"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "l8_df_sedona.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "awful-wholesale",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_sedona_month = l8_df_sedona.withColumn('month', F.month(F.col('l8_timestep')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "helpful-melissa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract data within potential time slots\n",
    "import time\n",
    "import datetime\n",
    "\n",
    "start_timestamp = time.mktime(datetime.datetime.strptime(start_date, \"%Y-%m-%d\").timetuple())\n",
    "start_timestamp = start_timestamp - hour_delay * 3600 # for potential time delay\n",
    "\n",
    "end_timestamp = time.mktime(datetime.datetime.strptime(end_date, \"%Y-%m-%d\").timetuple())\n",
    "end_timestamp = end_timestamp + hour_delay * 3600 # for potential time delay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "still-plenty",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "33308"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "l8_df_sedona = l8_df_sedona.filter(F.unix_timestamp(F.col('l8_timestep')) <= end_timestamp).filter(F.unix_timestamp(F.col('l8_timestep')) >= start_timestamp)\n",
    "l8_df_sedona.createOrReplaceTempView(\"l8_df_sedona\")\n",
    "\n",
    "## Show the schema of the table\n",
    "spark.table(\"l8_df_sedona\").count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "english-snowboard",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_wkt.createOrReplaceTempView(\"s2_df_wkt\")\n",
    "s2_df_sedona = spark.sql(\"select ST_GeomFromWKT(WKT_geo) as s2_geometry, s2_index as s2_id, timestamp as s2_timestamp from s2_df_wkt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "opening-broadcasting",
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate the area for the s2\n",
    "\n",
    "s2_df_sedona.createOrReplaceTempView(\"s2_df_sedona\")\n",
    "s2_df_sedona = spark.sql(\"select ST_Area(s2_geometry) as s2_area, * from s2_df_sedona\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "absolute-optimum",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+--------------------+--------------------+\n",
      "|             s2_area|         s2_geometry|               s2_id|        s2_timestamp|\n",
      "+--------------------+--------------------+--------------------+--------------------+\n",
      "|  0.9353367184743338|POLYGON ((-6.5921...|20171226T094359_2...|2018-02-02 11:37:...|\n",
      "|    0.04975974010451|POLYGON ((33.1015...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|0.028138156238690933|POLYGON ((34.1301...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.9896346515425708|POLYGON ((34.1465...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.8891776385157389|POLYGON ((34.1574...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "| 0.22045620916115177|POLYGON ((35.1594...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.1296895785175753|POLYGON ((35.1689...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.1403158483431393|POLYGON ((34.0543...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.3490307087263577|POLYGON ((35.0638...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.0561604589873097|POLYGON ((36.0089...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.1389331101552205|POLYGON ((35.1079...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.6674468416368683|POLYGON ((33.3628...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.4433850315383707|POLYGON ((34.1677...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|   0.216942383203032|POLYGON ((34.1008...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.1511821013349017|POLYGON ((34.0647...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.1625541701612807|POLYGON ((34.0756...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.1741105497465945|POLYGON ((34.0746...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.0694739134405127|POLYGON ((35.3049...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.8494615591637208|POLYGON ((34.6001...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.6255690971259398|POLYGON ((35.3583...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "+--------------------+--------------------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "s2_df_sedona.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "comfortable-favor",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_sedona_month = s2_df_sedona.withColumn('month', F.month(F.col('s2_timestamp')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "blessed-buffalo",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+--------------+\n",
      "|month|count(s2_area)|\n",
      "+-----+--------------+\n",
      "|    8|         30879|\n",
      "|    6|         30560|\n",
      "|    2|         22818|\n",
      "|    7|         30632|\n",
      "|   12|        156222|\n",
      "|    1|         21309|\n",
      "|    3|         26591|\n",
      "|   11|         22792|\n",
      "|   10|         27812|\n",
      "|    4|         30875|\n",
      "|    9|         29113|\n",
      "|    5|         31397|\n",
      "+-----+--------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "s2_df_sedona_month.groupby('month').agg(F.count('s2_area')).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "occupational-course",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pr_df_wkt.createOrReplaceTempView(\"pr_df_wkt\")\n",
    "# pr_df_sedona = spark.sql(\"select ST_GeomFromWKT(WKT_geo) as pr_geometry, img_id as pr_id, timestep as pr_timestep from pr_df_wkt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "special-brief",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pr_df_sedona.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "mysterious-impression",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_sedona = s2_df_sedona.filter(F.unix_timestamp(F.col('s2_timestamp')) <= end_timestamp).filter(F.unix_timestamp(F.col('s2_timestamp')) >= start_timestamp)\n",
    "s2_df_sedona.createOrReplaceTempView(\"s2_df_sedona\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "pressing-algebra",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "69679"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s2_df_sedona.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "departmental-notification",
   "metadata": {},
   "source": [
    "### prepare timestamps for temporal join"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "bored-quilt",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_sedona = l8_df_sedona.withColumn('l8_timestamp_date', F.to_timestamp(F.col('l8_timestep'), 'yyyy-MM-dd HH:mm:ss'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "crazy-rescue",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "|           l8_area|         l8_geometry|           l8_img_id|         l8_timestep|  l8_timestamp_date|\n",
      "+------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "|10.260300388817239|POLYGON ((-31.595...|LC08_001008_20180316|2018-03-16 14:09:...|2018-03-16 14:09:31|\n",
      "| 7.144768594030483|POLYGON ((-40.345...|LC08_001014_20180316|2018-03-16 14:11:...|2018-03-16 14:11:55|\n",
      "| 6.442331433563663|POLYGON ((-43.034...|LC08_001016_20180316|2018-03-16 14:12:...|2018-03-16 14:12:42|\n",
      "| 6.172094051092833|POLYGON ((-44.464...|LC08_001017_20180316|2018-03-16 14:13:...|2018-03-16 14:13:06|\n",
      "| 5.933376832674818|POLYGON ((-44.946...|LC08_001018_20180212|2018-02-12 14:13:...|2018-02-12 14:13:46|\n",
      "|5.9307340839153655|POLYGON ((-46.918...|LC08_001018_20180316|2018-03-16 14:13:...|2018-03-16 14:13:30|\n",
      "| 4.367998283490747|POLYGON ((-51.729...|LC08_001027_20180111|2018-01-11 14:17:...|2018-01-11 14:17:37|\n",
      "| 4.362289170929774|POLYGON ((-51.245...|LC08_001027_20180127|2018-01-27 14:17:...|2018-01-27 14:17:28|\n",
      "| 4.370303065095724|POLYGON ((-53.820...|LC08_001027_20180316|2018-03-16 14:17:...|2018-03-16 14:17:05|\n",
      "| 4.261514576021454|POLYGON ((-54.323...|LC08_001028_20180111|2018-01-11 14:18:...|2018-01-11 14:18:01|\n",
      "| 4.255227124093869|POLYGON ((-51.520...|LC08_001028_20180127|2018-01-27 14:17:...|2018-01-27 14:17:52|\n",
      "|4.2559194353723555|POLYGON ((-51.645...|LC08_001028_20180316|2018-03-16 14:17:...|2018-03-16 14:17:29|\n",
      "| 3.102463690286044|POLYGON ((-60.157...|LC08_001048_20180127|2018-01-27 14:25:...|2018-01-27 14:25:49|\n",
      "|3.1048898210153912|POLYGON ((-59.797...|LC08_001048_20180212|2018-02-12 14:25:...|2018-02-12 14:25:43|\n",
      "|3.1035103815509286|POLYGON ((-60.153...|LC08_001048_20180228|2018-02-28 14:25:...|2018-02-28 14:25:36|\n",
      "|3.1035357291695447|POLYGON ((-60.153...|LC08_001048_20180316|2018-03-16 14:25:...|2018-03-16 14:25:27|\n",
      "|3.0793366334224244|POLYGON ((-62.186...|LC08_001049_20180111|2018-01-11 14:26:...|2018-01-11 14:26:22|\n",
      "|3.0791311888339417|POLYGON ((-61.783...|LC08_001049_20180127|2018-01-27 14:26:...|2018-01-27 14:26:13|\n",
      "|3.0804081980587523|POLYGON ((-60.485...|LC08_001049_20180212|2018-02-12 14:26:...|2018-02-12 14:26:07|\n",
      "|3.0806823301902426|POLYGON ((-60.276...|LC08_001049_20180228|2018-02-28 14:26:...|2018-02-28 14:26:00|\n",
      "+------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "l8_df_sedona.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "cognitive-subcommittee",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_sedona = s2_df_sedona.withColumn('s2_timestamp_date', F.to_timestamp(F.col('s2_timestamp'), 'yyyy-MM-dd HH:mm:ss'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "successful-arctic",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "|             s2_area|         s2_geometry|               s2_id|        s2_timestamp|  s2_timestamp_date|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "|  0.9353367184743338|POLYGON ((-6.5921...|20171226T094359_2...|2018-02-02 11:37:...|2018-02-02 11:37:55|\n",
      "|    0.04975974010451|POLYGON ((33.1015...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|0.028138156238690933|POLYGON ((34.1301...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.9896346515425708|POLYGON ((34.1465...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.8891776385157389|POLYGON ((34.1574...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "| 0.22045620916115177|POLYGON ((35.1594...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.1296895785175753|POLYGON ((35.1689...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.1403158483431393|POLYGON ((34.0543...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.3490307087263577|POLYGON ((35.0638...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.0561604589873097|POLYGON ((36.0089...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.1389331101552205|POLYGON ((35.1079...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.6674468416368683|POLYGON ((33.3628...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.4433850315383707|POLYGON ((34.1677...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|   0.216942383203032|POLYGON ((34.1008...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.1511821013349017|POLYGON ((34.0647...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.1625541701612807|POLYGON ((34.0756...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.1741105497465945|POLYGON ((34.0746...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.0694739134405127|POLYGON ((35.3049...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.8494615591637208|POLYGON ((34.6001...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.6255690971259398|POLYGON ((35.3583...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "s2_df_sedona.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "severe-phase",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1514782800.0"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# try to generate a temporal index\n",
    "\n",
    "# find start timestamp\n",
    "import datetime\n",
    "import time\n",
    " \n",
    "# assigned regular string date\n",
    "start_date_datetype = datetime.datetime(2018, 1, 1, 0, 0)\n",
    "# print(\"date_time =>\",date_time)\n",
    " \n",
    "start_date_timestamp = time.mktime(start_date_datetype.timetuple())\n",
    "start_date_timestamp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "operational-chapel",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get time_duration and hour_delay into seconds for comparison\n",
    "time_index_hour_length_second = time_index_hour_length * 3600\n",
    "hour_delay_second = hour_delay * 3600"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "supported-cinema",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "86400"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "time_index_hour_length_second"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "handed-instrument",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_time_index = s2_df_sedona.withColumn('time_index', F.floor((F.unix_timestamp('s2_timestamp_date') - F.lit(start_date_timestamp)) / F.lit(time_index_hour_length_second)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "hidden-roulette",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+\n",
      "|             s2_area|         s2_geometry|               s2_id|        s2_timestamp|  s2_timestamp_date|time_index|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+\n",
      "|  0.9353367184743338|POLYGON ((-6.5921...|20171226T094359_2...|2018-02-02 11:37:...|2018-02-02 11:37:55|        32|\n",
      "|    0.04975974010451|POLYGON ((33.1015...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|0.028138156238690933|POLYGON ((34.1301...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.9896346515425708|POLYGON ((34.1465...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.8891776385157389|POLYGON ((34.1574...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "| 0.22045620916115177|POLYGON ((35.1594...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.1296895785175753|POLYGON ((35.1689...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.1403158483431393|POLYGON ((34.0543...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.3490307087263577|POLYGON ((35.0638...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.0561604589873097|POLYGON ((36.0089...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.1389331101552205|POLYGON ((35.1079...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.6674468416368683|POLYGON ((33.3628...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.4433850315383707|POLYGON ((34.1677...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|   0.216942383203032|POLYGON ((34.1008...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.1511821013349017|POLYGON ((34.0647...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.1625541701612807|POLYGON ((34.0756...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.1741105497465945|POLYGON ((34.0746...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.0694739134405127|POLYGON ((35.3049...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.8494615591637208|POLYGON ((34.6001...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.6255690971259398|POLYGON ((35.3583...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_time_index.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "african-knife",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_time_index_early = sentinel_sedona_time_index.filter(((F.unix_timestamp('s2_timestamp_date') - F.lit(start_date_timestamp)) % F.lit(time_index_hour_length_second)) <= hour_delay_second)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "upper-pharmaceutical",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+-----------+-----+------------+-----------------+----------+\n",
      "|s2_area|s2_geometry|s2_id|s2_timestamp|s2_timestamp_date|time_index|\n",
      "+-------+-----------+-----+------------+-----------------+----------+\n",
      "+-------+-----------+-----+------------+-----------------+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_time_index_early.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "innovative-petroleum",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "advanced-ireland",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_time_index_late = sentinel_sedona_time_index.filter(((F.unix_timestamp('s2_timestamp_date') - F.lit(start_date_timestamp)) % F.lit(time_index_hour_length_second)) >= (time_index_hour_length_second - hour_delay_second))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "welsh-serum",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+-----------+-----+------------+-----------------+----------+\n",
      "|s2_area|s2_geometry|s2_id|s2_timestamp|s2_timestamp_date|time_index|\n",
      "+-------+-----------+-----+------------+-----------------+----------+\n",
      "+-------+-----------+-----+------------+-----------------+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_time_index_late.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "female-comparative",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "special-retrieval",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_dateforjoin = sentinel_sedona_time_index.withColumn('time_index_join', F.col('time_index')).union(sentinel_sedona_time_index_late.withColumn('time_index_join', (F.col('time_index') + 1))).union(sentinel_sedona_time_index_early.withColumn('time_index_join', (F.col('time_index') - 1)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "greatest-conviction",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+\n",
      "|             s2_area|         s2_geometry|               s2_id|        s2_timestamp|  s2_timestamp_date|time_index|time_index_join|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+\n",
      "|  0.9353367184743338|POLYGON ((-6.5921...|20171226T094359_2...|2018-02-02 11:37:...|2018-02-02 11:37:55|        32|             32|\n",
      "|    0.04975974010451|POLYGON ((33.1015...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|0.028138156238690933|POLYGON ((34.1301...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.9896346515425708|POLYGON ((34.1465...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.8891776385157389|POLYGON ((34.1574...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "| 0.22045620916115177|POLYGON ((35.1594...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.1296895785175753|POLYGON ((35.1689...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.1403158483431393|POLYGON ((34.0543...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.3490307087263577|POLYGON ((35.0638...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.0561604589873097|POLYGON ((36.0089...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.1389331101552205|POLYGON ((35.1079...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.6674468416368683|POLYGON ((33.3628...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.4433850315383707|POLYGON ((34.1677...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|   0.216942383203032|POLYGON ((34.1008...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.1511821013349017|POLYGON ((34.0647...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.1625541701612807|POLYGON ((34.0756...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.1741105497465945|POLYGON ((34.0746...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.0694739134405127|POLYGON ((35.3049...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.8494615591637208|POLYGON ((34.6001...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.6255690971259398|POLYGON ((35.3583...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "reflected-rotation",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "69679"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "usual-brave",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "69679"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin.drop_duplicates(subset=['s2_id']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "modular-timeline",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "appointed-greeting",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "artistic-associate",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "posted-linux",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "green-concentrate",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "boxed-seeking",
   "metadata": {},
   "source": [
    "## Third, spatio-temporal join Sentinel dataset and L8 dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "round-process",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3.1 join the two Sedona dataframe according the timestamps, hour level at now, consider time delay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "chinese-cedar",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_dateforjoin_maxmin = sentinel_sedona_dateforjoin.withColumn('sentinel_max_timestamp', (F.unix_timestamp(\"s2_timestamp_date\") + hour_delay * 3600)).\\\n",
    "withColumn('sentinel_min_timestamp', (F.unix_timestamp(\"s2_timestamp_date\") - hour_delay * 3600))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "sharing-framing",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+----------------------+----------------------+\n",
      "|             s2_area|         s2_geometry|               s2_id|        s2_timestamp|  s2_timestamp_date|time_index|time_index_join|sentinel_max_timestamp|sentinel_min_timestamp|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+----------------------+----------------------+\n",
      "|  0.9353367184743338|POLYGON ((-6.5921...|20171226T094359_2...|2018-02-02 11:37:...|2018-02-02 11:37:55|        32|             32|            1517611075|            1517567875|\n",
      "|    0.04975974010451|POLYGON ((33.1015...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|0.028138156238690933|POLYGON ((34.1301...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.9896346515425708|POLYGON ((34.1465...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.8891776385157389|POLYGON ((34.1574...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "| 0.22045620916115177|POLYGON ((35.1594...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.1296895785175753|POLYGON ((35.1689...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.1403158483431393|POLYGON ((34.0543...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.3490307087263577|POLYGON ((35.0638...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.0561604589873097|POLYGON ((36.0089...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.1389331101552205|POLYGON ((35.1079...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.6674468416368683|POLYGON ((33.3628...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.4433850315383707|POLYGON ((34.1677...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|   0.216942383203032|POLYGON ((34.1008...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.1511821013349017|POLYGON ((34.0647...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.1625541701612807|POLYGON ((34.0756...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.1741105497465945|POLYGON ((34.0746...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.0694739134405127|POLYGON ((35.3049...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.8494615591637208|POLYGON ((34.6001...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.6255690971259398|POLYGON ((35.3583...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+----------------------+----------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin_maxmin.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fourth-passenger",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "northern-british",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_sedona = l8_df_sedona.withColumn('time_index_join', F.floor((F.unix_timestamp('l8_timestamp_date') - F.lit(start_date_timestamp)) / F.lit(time_index_hour_length_second)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "acknowledged-mirror",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_sedona = l8_df_sedona.withColumn('l8_timestamp', F.unix_timestamp(F.col('l8_timestamp_date')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "automatic-alias",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- l8_area: double (nullable = false)\n",
      " |-- l8_geometry: geometry (nullable = false)\n",
      " |-- l8_img_id: string (nullable = true)\n",
      " |-- l8_timestep: timestamp (nullable = true)\n",
      " |-- l8_timestamp_date: timestamp (nullable = true)\n",
      " |-- time_index_join: long (nullable = true)\n",
      " |-- l8_timestamp: long (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "l8_df_sedona.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "flush-guidance",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- s2_area: double (nullable = false)\n",
      " |-- s2_geometry: geometry (nullable = false)\n",
      " |-- s2_id: string (nullable = true)\n",
      " |-- s2_timestamp: timestamp (nullable = true)\n",
      " |-- s2_timestamp_date: timestamp (nullable = true)\n",
      " |-- time_index: long (nullable = true)\n",
      " |-- time_index_join: long (nullable = true)\n",
      " |-- sentinel_max_timestamp: long (nullable = true)\n",
      " |-- sentinel_min_timestamp: long (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin_maxmin.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "first-monte",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_dateforjoin_maxmin.createOrReplaceTempView(\"sentinel_sedona_dateforjoin_maxmin\")\n",
    "l8_df_sedona.createOrReplaceTempView(\"l8_df_sedona\")\n",
    "test_temporal_indexed_spatial_join = spark.sql(\"SELECT * FROM sentinel_sedona_dateforjoin_maxmin, l8_df_sedona WHERE ST_Intersects(sentinel_sedona_dateforjoin_maxmin.s2_geometry, l8_df_sedona.l8_geometry) AND sentinel_sedona_dateforjoin_maxmin.time_index_join = l8_df_sedona.time_index_join\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "challenging-identification",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_temporal_indexed_spatial_join.explain()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "id": "configured-franklin",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import time\n",
    "# start_spatial_time = time.time()\n",
    "# test_spatial_join.select('uuid', 'Description', 'is_timestamp').write.format('com.databricks.spark.csv')\\\n",
    "#         .mode(\"overwrite\")\\\n",
    "#         .option('header', True)\\\n",
    "#         .save('sentinel_is2_intersect_pure_spatial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "id": "pretty-vulnerability",
   "metadata": {},
   "outputs": [],
   "source": [
    "# with open('speed_pure_spatial_join_month.txt', 'w') as f:\n",
    "#     f.write('pure spatial join time:')\n",
    "#     f.write(str(time.time() - start_spatial_time))\n",
    "#     f.write('/n')\n",
    "#     f.write('number of spatial intersections:')\n",
    "#     f.write(str(test_spatial_join.count()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "elect-banner",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_temporal_indexed_spatial_join.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "informative-journalist",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_ST_join = test_temporal_indexed_spatial_join.filter(F.col('sentinel_max_timestamp') >= F.col('l8_timestamp')).filter(F.col('sentinel_min_timestamp') <= F.col('l8_timestamp'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "explicit-enough",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join.explain()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "rotary-vacation",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "start_temporal_time = time.time()\n",
    "test_ST_join.select('s2_id', 'l8_img_id', 's2_timestamp', 'l8_timestamp').write.format('com.databricks.spark.csv')\\\n",
    "        .mode(\"overwrite\")\\\n",
    "        .option('header', True)\\\n",
    "        .save('sentinel_L8_intersect_spatial_temporal')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "graphic-finish",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('STSI_join_{}_months_{}_hour_delay_S2L8.txt'.format(str(duration_month), str(hour_delay)), 'w') as f:\n",
    "    f.write('STSI time in total:')\n",
    "    f.write(str(time.time() - start_temporal_time))\n",
    "    f.write('\\n')\n",
    "    f.write('number of spatial intersections:')\n",
    "    f.write(str(test_ST_join.count()))\n",
    "    f.write('\\n')\n",
    "    f.write('from and to date')\n",
    "    f.write(start_date)\n",
    "    f.write('\\n')\n",
    "    f.write(end_date)\n",
    "    f.write('\\n')\n",
    "    f.write('L8 data count:')\n",
    "    f.write(str(l8_df_sedona.count()))\n",
    "    f.write('\\n')\n",
    "\n",
    "    f.write('Sentinel 2 data count:')\n",
    "    f.write(str(s2_df_sedona.count()))\n",
    "    f.write('\\n')\n",
    "    f.write('hour_delay:')\n",
    "    f.write(str(hour_delay))\n",
    "    \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "combined-stylus",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join.select('s2_index', 'Description', 'is_timestamp').drop_duplicates(subset=['s2_index', 'Description']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "military-simpson",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "worse-mentor",
   "metadata": {},
   "outputs": [],
   "source": [
    "# \"fhldsai{}_{}\".format(str(2), 'se')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "color-deficit",
   "metadata": {},
   "outputs": [],
   "source": [
    "# boundaries in the format of polygon）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "nasty-investor",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join_pd = test_ST_join.toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "hawaiian-thomson",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "id": "basic-lease",
   "metadata": {},
   "outputs": [],
   "source": [
    "# gpd.GeoSeries(test_ST_join_area_pd['geometry']).buffer(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "id": "hydraulic-spectrum",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join_pd['intersection'] = gpd.GeoSeries(test_ST_join_pd['s2_geometry']).buffer(0).intersection(gpd.GeoSeries(test_ST_join_pd['l8_geometry']).buffer(0), align=False)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "id": "mexican-prisoner",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join_pd['intersection_area'] = gpd.GeoSeries(test_ST_join_pd['intersection']).area\n",
    "# test_ST_join_pd['s2_pd_area'] = gpd.GeoSeries(test_ST_join_pd['s2_geometry']).area\n",
    "# test_ST_join_pd['l8_pd_area'] = gpd.GeoSeries(test_ST_join_pd['l8_geometry']).area"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "id": "touched-glance",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join_pd['intersection_percentage'] = test_ST_join_pd['intersection_area'] / test_ST_join_pd[['s2_pd_area', 's2_pd_area']].min(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "affecting-fiction",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "empirical-cancer",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join_pd.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "similar-sacramento",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join_pd[test_ST_join_pd['intersection_percentage'] > 0.5].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "disciplinary-diameter",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 78130 / 364922"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "handmade-viking",
   "metadata": {},
   "outputs": [],
   "source": [
    "# s2_l8_ST_join_valid_true_pd[s2_l8_ST_join_valid_true_pd['intersection_area'] / s2_l8_ST_join_valid_true_pd['s2_pd_area'] > 0.9]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "stuck-gossip",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_area_intersect = s2_l8_ST_join_valid_true_pd[s2_l8_ST_join_valid_true_pd['intersection_area'] / s2_l8_ST_join_valid_true_pd['s2_pd_area'] > 0.9]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "id": "owned-webcam",
   "metadata": {},
   "outputs": [],
   "source": [
    "# with open('ICESpark_temporal_indexed_join_{}_months_{}_hour_delay_S2L8.txt'.format(str(duration_month), str(hour_delay)), 'a+') as f:\n",
    "#     f.write('percentage of intersection area larger than 50% of the smaller RS image')\n",
    "#     f.write('\\n')\n",
    "#     f.write(str(test_ST_join_pd[test_ST_join_pd['intersection_percentage'] > 0.5].shape[0] / test_ST_join_pd.shape[0]))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "pressed-musician",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
